# Load in data
hires_2019_clean <- read_csv('../clean_data/hires_2019_clean.csv')
Rows: 124446 Columns: 19
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): start_station_name, start_station_description, end_station_name, end_station_description
dbl (11): duration, start_station_id, start_station_latitude, start_station_longitude, end_station_id, end_station...
date (2): start_date, end_date
time (2): start_time, end_time
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rain_2019_clean <- read_csv('../clean_data/rain_2019_clean.csv')
Rows: 365 Columns: 2
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (1): rainfall_mm
date (1): start_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# visualising use by month
hires_2019_clean %>%
mutate(month = month(start_date, label = TRUE), .before = 1) %>%
group_by(month) %>%
summarise(count = n()) %>%
ggplot() +
aes(x = month, y = count) +
geom_col(fill = "#F27F1B") +
geom_text(aes(label = count), vjust = 2, colour = "white", size = 3.5) +
labs(x = "\nMonth",
y = "Number of journeys\n",
title = "Total number of bike journeys by month (2019)",
subtitle = "Total = 124446 journeys\n\n") +
scale_y_discrete(expand = c(0,1), limits = c(0, 5000, 10000, 15000)) +
theme_minimal() +
theme(title = element_text(size = 12),
axis.text = element_text(size = 10),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"))
Mapping out stations
# create smaller dataset for map
map_data <- hires_2019_clean %>%
select(start_station_longitude, start_station_latitude, start_station_id, start_station_name, start_elevation) %>%
distinct(start_station_id, .keep_all = TRUE)
# create custom icon for bike hire stations
bike <- makeIcon("../www/bicycle-outline.png")
# map station data using leaflet app
map_data %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addTiles(providers$CartoDB.Positron) %>%
addMarkers(~start_station_longitude, ~start_station_latitude,
icon = ~bike,
#clusterOptions = markerClusterOptions(),
popup = ~paste0("Station ID: ", start_station_id,
"<br>Name: ", start_station_name,
"<br>Elevation (m): ", start_elevation))
# addCircles(lng = ~start_station_longitude,
# lat = ~start_station_latitude,
# color = "#F27F1B",
# popup = ~paste0("Station ID: ", start_station_id,
# "<br>Name: ", start_station_name,
# "<br>Elevation (m): ", start_elevation))
Most popular stations
# top 10 start and end stations
hires_2019_clean %>%
select(start_station_id, start_station_name, start_station_description, start_elevation) %>%
count(start_station_id, start_station_name, start_station_description, start_elevation) %>%
arrange(desc(n)) %>%
head(10)
hires_2019_clean %>%
select(end_station_id, end_station_name, end_station_description, end_elevation) %>%
count(end_station_id, end_station_name, end_station_description, end_elevation) %>%
arrange(desc(n)) %>%
head(10)

# map most popular stations data using leaflet app
map_data %>%
filter(start_station_id %in% c("248", "259", "265", "289", "257",
"249", "247", "171", "183", "262",
"250", "358", "258")) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addTiles(providers$CartoDB.Positron) %>%
addMarkers(~start_station_longitude, ~start_station_latitude,
icon = ~bike,
popup = ~paste0("Station ID: ", start_station_id,
"<br>Name: ", start_station_name,
"<br>Elevation (m): ", start_elevation))
NA
map_data
Error: object 'map_data' not found
hires_2019_clean %>%
select(start_station_id, end_station_id) %>%
filter(start_station_id == c("248", "259", "265", "289", "257", "262", "249", "247", "171", "183"),
end_station_id == c("262", "257", "250", "265", "248", "358", "259", "183", "171", "258")) %>%
chordDiagram(scale = TRUE)
hires_2019_clean %>%
select(start_station_id, end_station_id) %>%
filter(start_station_id == c("248", "259", "265", "289", "257", "262", "249", "247", "171", "183"),
end_station_id == c("262", "257", "250", "265", "248", "358", "259", "183", "171", "258")) %>%
ggraph(layout = "linear") +
geom_edge_arc(edge_colour = "black", edge_alpha = 0.3, edge_width = 0.2) +
geom_node_point(color = "#F27F1B", size = 10) +
geom_node_text(aes(label = name), repel = FALSE, size = 3, nudge_y = 0, colour = "white") +
labs(title = "Journeys between most popular stations\n\n") +
theme_void() +
theme(title = element_text(size = 12),
legend.position = "none",
plot.margin = unit(rep(1, 4), "cm"))
Journeys based on elevation
68622 downhill journeys 43774 uphill journeys 12050 flat journeys
12050 / 124446 * 100
[1] 9.682915
get_elev_raster()
Journeys based on weather
According to www.statista.com “A rainday is when one millimetre or
more of rain occurs in a day.” Based on this I have classified rainy
days as days where rainfall as recorded by www.power.larc.nasa.gov/ was
equal to or over 1mm in Edinburgh.

Hypothesis Test
# calculate the p-value and compare to α value
p_value <- null_distribution %>%
get_p_value(obs_stat = observed_stat, direction = "left")
Warning: Please be cautious in reporting a p-value of 0. This result is an approximation based on the number of `reps` chosen in the `generate()` step. See `?get_p_value()` for more information.
# hypothesis test
# α = 0.05
# H0 - Rain makes no difference to bike hires
# H1 - Rain has an impact on bike hires
# Type of test is one-sample proportion, right-sided.
#
# p value = 0
#
# The p-value being near to 0 means it is less than α, and so we should reject H0 based on our data. The result is statistically significant in favour of H1, that rain may impact bike hire numbers.
# rain_day <- hires_2019_clean %>%
# group_by(start_date) %>%
# mutate(rain_day = rainfall_mm >= 1) %>%
# select(start_date, rain_day) %>%
# count(start_date, rain_day)
#
# null_distribution <- rain_day %>%
# specify(response = rain_day, success = "TRUE") %>%
# hypothesize(null = "point", p = 0.05) %>%
# generate(reps = 10000, type = "draw") %>%
# calculate(stat = "prop")
#
# obs_stat <- rain_day %>%
# specify(response = rain_day, success = "TRUE") %>%
# calculate(stat = "prop")
#
# null_distribution %>%
# visualise() +
# shade_p_value(direction = "right", obs_stat = obs_stat)
#
# null_distribution %>%
# get_p_value(direction = "right", obs_stat = obs_stat)
# bootstrapped hypothesis
# α = 0.05
# H0 - Rain makes no difference to bike hires
# H1 - Rain has an impact on bike hires
# rain_day_flag <- rain_day %>%
# mutate(rain_day_flag = if_else(rain_day == "TRUE", 1, 0))
#
# null_distribution <- rain_day_flag %>%
# specify(response = rain_day_flag) %>%
# hypothesize(null = "point", mu = 0.05) %>%
# generate(reps = 10000, type = "bootstrap") %>%
# calculate(stat = "mean")
#
# null_distribution %>%
# visualise() +
# shade_p_value(direction = "right", obs_stat = obs_stat)
#
# null_distribution %>%
# get_p_value(direction = "right", obs_stat = obs_stat)
Independence Hypothesis Test On Rainy day data
Does a rainy day have an effect on bike hires
α = 0.05 H0 - Rain makes no difference to bike hires H1 - Rain has an
impact on bike hires
The p-value ≤ α, so I reject the null hypothesis H0 in favour of the
alternative hypothesis H1, that rain may have an effect on bike
hires.
Rainy Days in 2019 = 181 Rainy journeys = 58073 (46.7%) Non rainy
journeys = 66373 (53.3%)
66373 / 124446 * 100
[1] 53.33478
Time of Day analysis
Gym Bunnies - before 7am: 7783 23.07 mins
Morning Commuters - 7am to 9am: 12452 17.87 mins
Day Trippers - 9am to 5pm: 66880 29.75 mins
Homeward Bounders - 5pm to 6.30pm: 14961 23.86 mins
Evening Movers - 6.30pm to 10pm: 16815 23.5 mins
Pub Ponies - 10pm to Midnight: 5555 20.67 mins
Average journey time overall is 26.18 minutes
# Gym Bunnies - before 7am
hires_2019_clean %>%
mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>%
filter(start_time <= "07:00:00") %>%
mutate(mean_duration = round(mean(duration), 2)) %>%
count(mean_duration)
# Morning Commuters - 7am to 9am
hires_2019_clean %>%
mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>%
filter(start_time > "07:00:00", start_time <= "09:00:00") %>%
mutate(mean_duration = round(mean(duration), 2)) %>%
count(mean_duration)
# Day Trippers - 9am to 5pm
hires_2019_clean %>%
mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>%
filter(start_time > "09:00:00", start_time < "17:00:00") %>%
mutate(mean_duration = round(mean(duration), 2)) %>%
count(mean_duration)
# Homeward Bounders - 5pm to 6.30pm
hires_2019_clean %>%
mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>%
filter(start_time >= "17:00:00", start_time <= "18:30:00") %>%
mutate(mean_duration = round(mean(duration), 2)) %>%
count(mean_duration)
# Evening Movers - 6.30pm to 10pm
hires_2019_clean %>%
mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>%
filter(start_time > "18:30:00", start_time < "22:00:00") %>%
mutate(mean_duration = round(mean(duration), 2)) %>%
count(mean_duration)
# Pub Ponies - 10pm to Midnight
hires_2019_clean %>%
mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>%
filter(start_time >= "22:00:00") %>%
mutate(mean_duration = round(mean(duration), 2)) %>%
count(mean_duration)
# Average journey length
hires_2019_clean %>%
mutate(mean_duration = round(mean(duration), 2)) %>%
count(mean_duration)
NA
hires_2019_clean %>%
select(start_time) %>%
group_by(start_time) %>%
count() %>%
ggplot() +
geom_line(aes(start_time, n), col = "#F27F1B") +
labs(x = "\nTime of Day",
y = "Number of Journeys\n",
title = "Start time of journeys") +
theme_minimal() +
theme(title = element_text(size = 12),
axis.text = element_text(size = 10),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"))

# trying a raster image of elevation
ggplot() +
geom_raster(data = new_elevation_raster, aes(x = x, y = y)) +
geom_sf(data = elevation_raster, color = "white") +
coord_sf() +
scale_fill_viridis_c() +
labs(title = " ", x = " ", y = " ", fill = " ")
---
title: "Final Project workbook"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

```{r include=FALSE}
# load libraries
library(tidyverse)
library(janitor)
library(here)
library(lubridate)
library(leaflet)
library(elevatr)
library(rgdal)
library(circlize)
library(ggraph)
library(infer)
library(sf)
library(rgeoboundaries)
library(mapdeck)

```

```{r message=FALSE}
# Load in data
hires_2019_clean <- read_csv('../clean_data/hires_2019_clean.csv')
rain_2019_clean <- read_csv('../clean_data/rain_2019_clean.csv')


```



```{r}
# number of stations in total: 164
hires_2019_clean %>% 
  distinct(start_station_id) %>% 
  count()

hires_2019_clean %>% 
  distinct(end_station_id) %>% 
  count()
```


```{r}
# visualising use by month
hires_2019_clean %>% 
  mutate(month = month(start_date, label = TRUE), .before = 1) %>% 
  group_by(month) %>% 
  summarise(count = n()) %>%
  ggplot() +
  aes(x = month, y = count) +
  geom_col(fill = "#F27F1B") +
  geom_text(aes(label = count), vjust = 2, colour = "white", size = 3.5) +
  labs(x = "\nMonth",
       y = "Number of journeys\n",
       title = "Total number of bike journeys by month (2019)",
       subtitle = "Total = 124446 journeys\n\n") +
  scale_y_discrete(expand = c(0,1), limits = c(0, 5000, 10000, 15000)) +
  theme_minimal() +
  theme(title = element_text(size = 12),
        axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"))

```

```{r}
# visualise use by day of the week

hires_2019_clean %>% 
  mutate(day = wday(start_date, label = TRUE), .before = 1) %>% 
  group_by(day) %>% 
  summarise(count = n()) %>%
  ggplot() +
  aes(x = day, y = count) +
  geom_col(fill = "#F27F1B") +
  geom_text(aes(label = count), vjust = 2, colour = "white", size = 3.5) +
  labs(x = "\nDay of week",
       y = "Number of journeys\n",
       title = "Total number of bike journeys by day of week (2019)") +
  scale_y_discrete(expand = c(0,1), limits = c(0, 5000, 10000, 15000)) +
  theme_minimal() +
  theme(title = element_text(size = 12),
        axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"))
```


### Mapping out stations

```{r}
# create smaller dataset for map
map_data <- hires_2019_clean %>% 
  select(start_station_longitude, start_station_latitude, start_station_id, start_station_name, start_elevation) %>% 
  distinct(start_station_id, .keep_all = TRUE)
```


```{r}

# create custom icon for bike hire stations
bike  <-  makeIcon("../www/bicycle-outline.png")

# map station data using leaflet app
map_data %>% 
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addTiles(providers$CartoDB.Positron) %>%
  addMarkers(~start_station_longitude, ~start_station_latitude, 
             icon = ~bike,
             #clusterOptions = markerClusterOptions(),
             popup = ~paste0("Station ID: ", start_station_id,
                              "<br>Name: ", start_station_name,
                              "<br>Elevation (m): ", start_elevation))

  # addCircles(lng = ~start_station_longitude,
  #            lat = ~start_station_latitude,
  #            color = "#F27F1B",
  #            popup = ~paste0("Station ID: ", start_station_id,
  #                            "<br>Name: ", start_station_name,
  #                            "<br>Elevation (m): ", start_elevation))
```

### Most popular stations


```{r}
# top 10 start and end stations
hires_2019_clean %>% 
  select(start_station_id, start_station_name, start_station_description, start_elevation) %>% 
  count(start_station_id, start_station_name, start_station_description, start_elevation) %>% 
  arrange(desc(n)) %>% 
  head(10)

hires_2019_clean %>% 
  select(end_station_id, end_station_name, end_station_description, end_elevation) %>% 
  count(end_station_id, end_station_name, end_station_description, end_elevation) %>% 
  arrange(desc(n)) %>%
  head(10)
```


```{r}
# round trip vs a to b journey
hires_2019_clean %>% 
  filter(start_station_id == end_station_id) %>% 
  count()

11944 / 124446 * 100 # 9.6% of journeys were round trips

# most popular round trip stations
hires_2019_clean %>% 
  filter(start_station_id == end_station_id) %>% 
  count(start_station_id) %>% 
  arrange(desc(n))

# visualise round trip vs one ways
hires_2019_clean %>% 
  mutate(round_trips = start_station_id == end_station_id) %>% 
  count(round_trips) %>% 
  ggplot() +
  aes(round_trips, n) +
  geom_col(fill = "#F27F1B") +
  labs(x = "\nOne Way vs Round Trip",
       y = "Number of journeys\n",
       title = "One Way vs Round Trip journeys taken") +
  theme_minimal() +
  theme(title = element_text(size = 12),
        axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"))

```


```{r}
# map most popular stations data using leaflet app
map_data %>% 
  filter(start_station_id %in% c("248", "259", "265", "289", "257", 
                                 "249", "247", "171", "183", "262", 
                                 "250", "358", "258")) %>%
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addTiles(providers$CartoDB.Positron) %>%
  addMarkers(~start_station_longitude, ~start_station_latitude, 
             icon = ~bike,
             popup = ~paste0("Station ID: ", start_station_id,
                             "<br>Name: ", start_station_name,
                             "<br>Elevation (m): ", start_elevation))

```

```{r}
map_data %>% 
  filter(start_station_id %in% c("248", "259", "265", "289", "257", 
                                 "249", "247", "171", "183", "262", 
                                 "250", "358", "258")) %>% 
  ggplot(label = start_station_id) +
  geom_point(aes(start_station_id, start_elevation)) +
  theme_bw() +
  geom_text(aes(start_station_id, start_elevation, label = start_elevation, hjust=-0.2, vjust=0))

```

```{r}
library(mapdeck)

set_token("pk.eyJ1IjoiamI3NCIsImEiOiJjbDN4Nm52OGMwZ3pnM2NxdHhlMThxanA0In0.iFNcOguU6rkUQEhGruqSBg")  ## set your mapbox token here
#set_token("pk.eyJ1IjoiamI3NCIsImEiOiJjbDN4Nm52OGMwZ3pnM2NxdHhlMThxanA0In0.iFNcOguU6rkUQEhGruqSBg")  ## set your mapbox token here

#mapdeck_tokens()

hires_2019_clean %>% 
filter(start_station_id %in% c("248")) %>% 
  na.omit() %>% 
mapdeck(style = mapdeck_style('dark'), pitch = 45,
        location = c(55.942, -3.199))%>%
  add_arc(
      layer_id = 'arc_layer'
    , origin = c("start_station_longitude", "start_station_latitude")
    , destination = c("end_station_longitude", "end_station_latitude")
    , stroke_from_opacity = 100
    , stroke_to_opacity = 100
    , stroke_width = 3
    , stroke_from = "#F27F1B"
    , stroke_to = "#F27F1B"
  )

#"259", "265", "289", "257", "249", "247", "171", "183", "262", "250", "358", "258"
```


```{r}
hires_2019_clean %>% 
  select(start_station_id, end_station_id) %>% 
  filter(start_station_id == c("248", "259", "265", "289", "257", "262", "249", "247", "171", "183"), 
         end_station_id == c("262", "257", "250", "265", "248", "358", "259", "183", "171", "258")) %>% 
  chordDiagram(scale = TRUE)
  
```

```{r}
# Journeys ending at Victoria Quay
hires_2019_clean %>% 
  filter(end_station_id == "250") %>% 
  count()

# Journeys ending at Victoria Quay, by start station
hires_2019_clean %>% 
  filter(end_station_id == "250") %>% 
  count(start_station_id) %>% 
  arrange(desc(n))

# Journeys ending at George Square, by start station
hires_2019_clean %>% 
  filter(end_station_id == "171") %>% 
  count(start_station_id) %>% 
  arrange(desc(n))

hires_2019_clean %>% 
  filter(start_station_id == "259") %>% 
  count(end_station_id) %>% 
  arrange(desc(n))
```



```{r}
hires_2019_clean %>% 
  select(start_station_id, end_station_id) %>%
  filter(start_station_id == c("248", "259", "265", "289", "257", "262", "249", "247", "171", "183"), 
         end_station_id == c("262", "257", "250", "265", "248", "358", "259", "183", "171", "258")) %>% 
  ggraph(layout = "linear") + 
  geom_edge_arc(edge_colour = "black", edge_alpha = 0.3, edge_width = 0.2) +
  geom_node_point(color = "#F27F1B", size = 10) +
  geom_node_text(aes(label = name), repel = FALSE, size = 3, nudge_y = 0, colour = "white") +
  labs(title = "Journeys between most popular stations\n\n") +
  theme_void() +
  theme(title = element_text(size = 12),
        legend.position = "none",
        plot.margin = unit(rep(1, 4), "cm"))
```


### Journeys based on elevation

68622 downhill journeys
43774 uphill journeys
12050 flat journeys

```{r}
# downhill vs uphill vs flat journeys
# based on elevation of start and end stations

# 68622 downhill journeys 55%
hires_2019_clean %>% 
  filter(elevation_diff < 0) %>% 
  count()

68622 / 124446 * 100

# 43774 uphill journeys 35%
hires_2019_clean %>% 
  filter(elevation_diff > 0) %>% 
  count()

43774 / 124446 * 100

# 12050 flat journeys 9.7%
hires_2019_clean %>% 
  filter(elevation_diff == 0) %>% 
  count()

12050 / 124446 * 100
```
```{r}
get_elev_raster()
```



### Journeys based on weather

According to www.statista.com "A rainday is when one millimetre or more of rain occurs in a day." Based on this I have classified rainy days as days where rainfall as recorded by www.power.larc.nasa.gov/ was equal to or over 1mm in Edinburgh.


```{r}
hires_2019_clean %>% 
  select(start_date, rainfall_mm) %>% 
  ggplot() +
  geom_line(aes(start_date, rainfall_mm), col = "blue") +
  geom_hline(yintercept = 1, col="#F27F1B") +
  labs(x = "\nDate",
       y = "Rainfall (mm)\n",
       title = "Edinburgh's daily rainfall in 2019",
       subtitle = "(Orange line at 1mm, above which we class as rainy day)\n") +
  theme_minimal() +
  theme(title = element_text(size = 12),
        axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"))
```

### Hypothesis Test

```{r}
# Independence Hypothesis
# α = 0.05
# H0 - uphill journeys makes no difference to bike hires
# H1 - uphill journeys have an impact on bike hires

# organise the data so I have an uphill column with a logical output TRUE/FALSE if the elevation change was positive
# also count the number of journeys that occurred on each day
uphill <- hires_2019_clean %>%
  group_by(start_date) %>% 
  mutate(uphill = elevation_diff > 0) %>%
  select(start_date, uphill) %>% 
  count(start_date, uphill)

# create a null distribution permuting the True and False values on whether the journey was uphill or not
# do 10k reps and check the difference in means of the groups
null_distribution <- uphill %>% 
  specify(n ~ uphill) %>%
  hypothesize(null = "independence") %>% 
  generate(reps = 10000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("TRUE", "FALSE")) 

# create an observed stat
observed_stat <- uphill %>% 
  specify(n ~ uphill) %>%
  calculate(stat = "diff in means", order = c("TRUE", "FALSE")) 

# plot observed stat on the null distribution
null_distribution %>%
  visualise() +
  shade_p_value(obs_stat = observed_stat, direction = "left")

# calculate the p-value and compare to α value
p_value <- null_distribution %>%
  get_p_value(obs_stat = observed_stat, direction = "left")

p_value
```


```{r}
# hypothesis test
# α = 0.05
# H0 - Rain makes no difference to bike hires
# H1 - Rain has an impact on bike hires
# Type of test is one-sample proportion, right-sided.
# 
# p value = 0 
# 
# The p-value being near to 0 means it is less than α, and so we should reject H0 based on our data. The result is statistically significant in favour of H1, that rain may impact bike hire numbers.

# rain_day <- hires_2019_clean %>%
#   group_by(start_date) %>% 
#   mutate(rain_day = rainfall_mm >= 1) %>% 
#   select(start_date, rain_day) %>% 
#   count(start_date, rain_day)
# 
# null_distribution <- rain_day %>%
#   specify(response = rain_day, success = "TRUE") %>%
#   hypothesize(null = "point", p = 0.05) %>%
#   generate(reps = 10000, type = "draw") %>%
#   calculate(stat = "prop")
# 
# obs_stat <- rain_day %>%
#   specify(response = rain_day, success = "TRUE") %>%
#   calculate(stat = "prop")
# 
# null_distribution %>%
#   visualise() +
#   shade_p_value(direction = "right", obs_stat = obs_stat)
# 
# null_distribution %>%
#   get_p_value(direction = "right", obs_stat = obs_stat)

```

```{r}
# bootstrapped hypothesis
# α = 0.05
# H0 - Rain makes no difference to bike hires
# H1 - Rain has an impact on bike hires

# rain_day_flag <- rain_day %>%
#   mutate(rain_day_flag = if_else(rain_day == "TRUE", 1, 0))
# 
# null_distribution <- rain_day_flag %>%
#   specify(response = rain_day_flag) %>%
#   hypothesize(null = "point", mu = 0.05) %>%
#   generate(reps = 10000, type = "bootstrap") %>%
#   calculate(stat = "mean")
# 
# null_distribution %>%
#   visualise() +
#   shade_p_value(direction = "right", obs_stat = obs_stat)
# 
# null_distribution %>%
#   get_p_value(direction = "right", obs_stat = obs_stat)
```

### Independence Hypothesis Test On Rainy day data

Does a rainy day have an effect on bike hires

α = 0.05
H0 - Rain makes no difference to bike hires
H1 - Rain has an impact on bike hires

The p-value ≤ α, so I reject the null hypothesis H0 in favour of the alternative hypothesis H1, 
that rain _may_ have an effect on bike hires.

```{r}
# Independence Hypothesis
# α = 0.05
# H0 - Rain makes no difference to bike hires
# H1 - Rain has an impact on bike hires

# organise the data so I have a rain_day column with a logical output TRUE/FALSE if it rained that day
# also count the number of journeys that occurred on each day
rain_day <- hires_2019_clean %>%
  group_by(start_date) %>% 
  mutate(rain_day = rainfall_mm >= 1) %>% 
  select(start_date, rain_day) %>% 
  count(start_date, rain_day)

# create a null distribution permuting the True and False values on whether it rained
# do 10k reps and check the difference in means of the groups
null_distribution <- rain_day %>% 
  specify(n ~ rain_day) %>%
  hypothesize(null = "independence") %>% 
  generate(reps = 10000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("TRUE", "FALSE")) 

# create an observed stat
observed_stat <- rain_day %>% 
  specify(n ~ rain_day) %>%
  calculate(stat = "diff in means", order = c("TRUE", "FALSE")) 

# plot observed stat on the null distribution
null_distribution %>%
  visualise() +
  shade_p_value(obs_stat = observed_stat, direction = "left")

# calculate the p-value and compare to α value
p_value <- null_distribution %>%
  get_p_value(obs_stat = observed_stat, direction = "left")

p_value

```

Rainy Days in 2019 = 181
Rainy journeys = 58073 (46.7%)
Non rainy journeys = 66373 (53.3%)

```{r}
# count the number of days with rainfall equal to or over 1mm
rain_2019_clean %>% 
  filter(rainfall_mm >= 1) %>% 
  count()

# journeys based on rainfall
# based on daily precipitation
hires_2019_clean %>% 
  filter(rainfall_mm >= 1) %>% 
  count()

58073 / 124446 * 100

hires_2019_clean %>% 
  filter(rainfall_mm < 1) %>% 
  count()

66373 / 124446 * 100
```


### Time of Day analysis

Gym Bunnies - before 7am:           7783   23.07 mins   
Morning Commuters - 7am to 9am:     12452  17.87 mins    
Day Trippers - 9am to 5pm:          66880  29.75 mins    
Homeward Bounders - 5pm to 6.30pm:  14961  23.86 mins    
Evening Movers - 6.30pm to 10pm:    16815  23.5 mins    
Pub Ponies - 10pm to Midnight:      5555   20.67 mins   

Average journey time overall is 26.18 minutes

```{r}
# Gym Bunnies - before 7am
hires_2019_clean %>% 
  mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>% 
  filter(start_time <= "07:00:00") %>% 
  mutate(mean_duration = round(mean(duration), 2)) %>% 
  count(mean_duration)

# Morning Commuters - 7am to 9am
hires_2019_clean %>% 
  mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>% 
  filter(start_time > "07:00:00", start_time <= "09:00:00") %>% 
  mutate(mean_duration = round(mean(duration), 2)) %>% 
  count(mean_duration)

# Day Trippers - 9am to 5pm
hires_2019_clean %>% 
  mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>% 
  filter(start_time > "09:00:00", start_time < "17:00:00") %>% 
  mutate(mean_duration = round(mean(duration), 2)) %>% 
  count(mean_duration)

# Homeward Bounders - 5pm to 6.30pm
hires_2019_clean %>% 
  mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>% 
  filter(start_time >= "17:00:00", start_time <= "18:30:00") %>% 
  mutate(mean_duration = round(mean(duration), 2)) %>% 
  count(mean_duration)

# Evening Movers - 6.30pm to 10pm
hires_2019_clean %>% 
  mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>% 
  filter(start_time > "18:30:00", start_time < "22:00:00") %>% 
  mutate(mean_duration = round(mean(duration), 2)) %>% 
  count(mean_duration)

# Pub Ponies - 10pm to Midnight
hires_2019_clean %>% 
  mutate(start_time = as.character(start_time), end_time = as.character(end_time)) %>% 
  filter(start_time >= "22:00:00") %>% 
  mutate(mean_duration = round(mean(duration), 2)) %>% 
  count(mean_duration)

# Average journey length
hires_2019_clean %>% 
  mutate(mean_duration = round(mean(duration), 2)) %>% 
  count(mean_duration)

```


```{r}
hires_2019_clean %>% 
  select(start_time) %>%
  group_by(start_time) %>% 
  count() %>%
  ggplot() +
  geom_line(aes(start_time, n), col = "#F27F1B") +
  labs(x = "\nTime of Day",
       y = "Number of Journeys\n",
       title = "Start time of journeys") +
  theme_minimal() +
  theme(title = element_text(size = 12),
        axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"))
```

```{r}
# trying a raster image of elevation
ggplot() +
  geom_raster(data = new_elevation_raster, aes(x = x, y = y)) +
  geom_sf(data = elevation_raster, color = "white") +
  coord_sf() +
  scale_fill_viridis_c() +
  labs(title = " ", x = " ", y = " ", fill = " ")
```

